Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())MultiAssayExperiment objectlibrary(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())crf_files <-
get_01_output_dir() |>
fs::dir_ls() |>
str_subset("/01_")
crf_clean_file <- crf_files |> str_subset("01_crf_clean") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_raw_file <- crf_files |> str_subset("01_crf_raw") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_dict_file <- crf_files |> str_subset("01_crf_plates_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1)
load(crf_clean_file, verbose = TRUE)Loading objects:
crf_clean
load(crf_raw_file, verbose = TRUE)Loading objects:
crf_raw
load(crf_dict_file, verbose = TRUE)Loading objects:
crf_plates_dictionary
rm(crf_clean_file, crf_raw_file, crf_dict_file)Participants tableload(
crf_files |> str_subset("01_participants_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participants
load(
crf_files |> str_subset("01_participants_variable_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participants_variable_dictionary
load(
crf_files |> str_subset("01_participant_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participant_crfs_merged
Visits tableload(
crf_files |> str_subset("01_visits_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits
load(
crf_files |> str_subset("01_visits_long_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits_long
load(
crf_files |> str_subset("01_visits_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits_crfs_merged
visits |>
ggplot() +
aes(x = study_day, y = pid |> factor(), color = visit_code |> factor()) +
geom_point() We define the uid (unique identifier):
visits <-
visits |>
mutate(
uid = str_c(pid, visit_code, sep = "_")
) |>
select(uid, everything())Exposures tableload(
crf_files |> str_subset("01_exposures_") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
exposures
SummarizedExperiment objectsdata_dir <- get_01_output_dir()
se_files <- fs::dir_ls(data_dir, regexp = "se_.*\\.rds$") |> sort(decreasing = TRUE)mg_file <- se_files[grepl("mg", se_files)][1]
se_mg <- readRDS(mg_file)
se_mg <- check_se(se_mg)
se_mg# A SummarizedExperiment-tibble abstraction: 775,884 × 58
# Features=779 | Samples=996 | Assays=count, count_corr, rel_ab_all,
# rel_ab_bact, rel_ab
.feature .sample count count_corr rel_ab_all rel_ab_bact rel_ab uid mg_uid
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 C0022A1 0681000… 0 0 0 0 0 0681… MG_202
2 C0059E1 0681000… 0 0 0 0 0 0681… MG_202
3 C0175A1 0681000… 0 0 0 0 0 0681… MG_202
4 FF00018 0681000… 0 0 0 0 0 0681… MG_202
5 FF00051 0681000… 0 0 0 0 0 0681… MG_202
6 UC101 0681000… 0 0 0 0 0 0681… MG_202
7 122010 0681000… 0 0 0 0 0 0681… MG_202
8 185329 0681000… 0 0 0 0 0 0681… MG_202
9 C0006A1 0681000… 0 0 0 0 0 0681… MG_202
10 C0028A1 0681000… 0 0 0 0 0 0681… MG_202
# ℹ 40 more rows
# ℹ 49 more variables: sample_type <fct>, control_type <fct>,
# total_non_human_reads <dbl>, rel_ab_multi_genera <dbl>,
# poor_quality_mg_data <lgl>, cst <chr>, sub_cst <chr>, valencia_score <dbl>,
# swab_barcode <chr>, mg_pid <chr>, mg_visit_code <chr>,
# visit_code_flagged <lgl>, mg_sample_type <chr>, ext_lib_plate_nb <dbl>,
# ext_lib_plate_id <dbl>, ext_lib_position <chr>, …
qPCR_file <- se_files[grepl("pcr_agg", se_files)][1]
se_qPRC <- readRDS(qPCR_file)
se_qPRC <- check_se(se_qPRC)
se_qPRC# A SummarizedExperiment-tibble abstraction: 16,464 × 23
# Features=16 | Samples=1029 | Assays=n_non_na, dilution, copies_per_swab_med,
# copies_per_swab_mean, copies_per_swab_cv
.feature .sample n_non_na dilution copies_per_swab_med copies_per_swab_mean
<chr> <chr> <int> <dbl> <dbl> <dbl>
1 C0175A1 06810000… 3 5 0 0
2 C0022A1 06810000… 3 5 0 0
3 C0059E1 06810000… 3 5 0 0
4 UC101 06810000… 3 5 0 0
5 FF00018 06810000… 3 5 0 0
6 FF00051 06810000… 3 5 0 0
7 185329 06810000… 3 20 0 0
8 C0028A1 06810000… 3 20 0 0
9 C0112A1 06810000… 3 20 0 0
10 FF00004 06810000… 3 20 0 0
# ℹ 310 more rows
# ℹ 17 more variables: copies_per_swab_cv <dbl>, uid <chr>,
# qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
# qpcr_sample_id <chr>, vibr_sample_id <chr>, ext_lib_plate_nb <chr>,
# target <fct>, fluor <chr>, strain_group_qpcr <chr>, taxon_label <chr>,
# LBP <fct>, strain_id <fct>, strain_origin <fct>, biose_id <chr>,
# pcr_plate_ids <chr>
amplicon_file <- se_files[grepl("16S_agg", se_files)][1]
se_16S <- readRDS(amplicon_file)
se_16S <- check_se(se_16S)
se_16S# A SummarizedExperiment-tibble abstraction: 3,365,223 × 48
# Features=817 | Samples=4119 | Assays=counts, rel_ab
.feature .sample counts rel_ab uid exclude_sample total_counts sample_type
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr>
1 Lactobac… 068100… 6904 0.0933 0681… No 74008 Clinical s…
2 Lactobac… 068100… 0 0 0681… No 74008 Clinical s…
3 Gardnere… 068100… 8284 0.112 0681… No 74008 Clinical s…
4 Ca_Lachn… 068100… 0 0 0681… No 74008 Clinical s…
5 Sneathia… 068100… 21143 0.286 0681… No 74008 Clinical s…
6 Fannyhes… 068100… 1601 0.0216 0681… No 74008 Clinical s…
7 Sneathia… 068100… 0 0 0681… No 74008 Clinical s…
8 Prevotel… 068100… 1963 0.0265 0681… No 74008 Clinical s…
9 Prevotel… 068100… 0 0 0681… No 74008 Clinical s…
10 f_Dethio… 068100… 3629 0.0490 0681… No 74008 Clinical s…
# ℹ 40 more rows
# ℹ 40 more variables: control_type <chr>, sample_id <chr>,
# sample_id_barcode <chr>, patient_id <chr>, visit_code_ps <chr>,
# idt_adapter_name <chr>, i5_index_forward <chr>, i5_index_reverse <chr>,
# i7_index_reverse <chr>, i7_index <chr>, sample_id_2 <chr>,
# total_spike_in_rel <dbl>, total_spike_in_counts <dbl>, cst <chr>,
# sub_cst <chr>, score <dbl>, specimen_type <chr>, well_id <chr>, …
luminex_file <- se_files[grepl("luminex_agg", se_files)][1]
se_luminex <- readRDS(luminex_file)
se_luminex <- check_se(se_luminex)
se_luminex# A SummarizedExperiment-tibble abstraction: 40,800 × 37
# Features=48 | Samples=850 | Assays=obs_conc, obs_conc_imp, value_type,
# value_types, sd_log10_obs_conc_imp, obs_conc_log10, obs_conc_imp_log10
.feature .sample obs_conc obs_conc_imp value_type value_types
<chr> <chr> <dbl> <dbl> <chr> <chr>
1 CTACK 068100004_0000 207. 207. Observed concentr… Observed c…
2 Eotaxin 068100004_0000 90.7 90.7 Observed concentr… Observed c…
3 FGF basic 068100004_0000 526. 526. Observed concentr… Observed c…
4 G-CSF 068100004_0000 767. 767. Observed concentr… Observed c…
5 GM-CSF 068100004_0000 64.2 64.2 Observed concentr… Observed c…
6 GRO-a 068100004_0000 3001. 3001. Observed concentr… Observed c…
7 HGF 068100004_0000 3805. 3805. Observed concentr… Observed c…
8 IFN-a2 068100004_0000 328. 328. Observed concentr… Observed c…
9 IFN-g 068100004_0000 4074. 4074. Observed concentr… Observed c…
10 IL-10 068100004_0000 33.2 33.2 Observed concentr… Observed c…
# ℹ 38 more rows
# ℹ 31 more variables: sd_log10_obs_conc_imp <dbl>, obs_conc_log10 <dbl>,
# obs_conc_imp_log10 <dbl>, uid <chr>, sample_id <chr>, visit <dbl>,
# weight <dbl>, volume <dbl>, approx_volume_based_on_aliquot_repos <dbl>,
# aliquot_volume <dbl>, dilution_200ul_tube <dbl>,
# volume_added_for_assay <dbl>, dilution_manifest <dbl>, dilution_new <dbl>,
# shorthand_title <chr>, alternate_title <chr>, …
flow_file <- se_files[grepl("flow", se_files)][1]
se_flow <- readRDS(flow_file)
se_flow <- check_se(se_flow)
se_flow# A SummarizedExperiment-tibble abstraction: 3,916 × 15
# Features=11 | Samples=356 | Assays=count, percentage
.feature .sample count percentage uid sample machine sample_type
<chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 Live 068100004_10… 2.16e6 NA 0681… Speci… 5R Clinical s…
2 Granulocytes 068100004_10… 2.98e4 1.38 0681… Speci… 5R Clinical s…
3 B_cells 068100004_10… 3.68e2 NA 0681… Speci… 5R Clinical s…
4 APC 068100004_10… 7.88e2 NA 0681… Speci… 5R Clinical s…
5 CD40 068100004_10… 4.06e2 51.5 0681… Speci… 5R Clinical s…
6 CD80 068100004_10… 3.82e2 48.5 0681… Speci… 5R Clinical s…
7 CD86 068100004_10… 4.92e2 62.4 0681… Speci… 5R Clinical s…
8 CD4 068100004_10… 4.13e3 NA 0681… Speci… 5R Clinical s…
9 CD8 068100004_10… 1.48e3 NA 0681… Speci… 5R Clinical s…
10 DblePosCD4 068100004_10… 1.58e2 3.82 0681… Speci… 5R Clinical s…
# ℹ 210 more rows
# ℹ 7 more variables: control_type <chr>, pid_assay <chr>,
# visit_code_assay <chr>, site_assay <chr>, cell_type <chr>,
# parent_cell_type <fct>, cell_type_label <fct>
MultiAssayExperiment objectassay_list <-
bind_rows(
tibble(
mae_assay_name = "mg",
se_object_name = "se_mg",
assay_label = "Metagenomic"
),
tibble(
mae_assay_name = "qPCR",
se_object_name = "se_qPRC",
assay_label = "qPCR"
),
tibble(
mae_assay_name = "amplicon",
se_object_name = "se_16S",
assay_label = "16S rRNA amplicon seq."
),
tibble(
mae_assay_name = "luminex",
se_object_name = "se_luminex",
assay_label = "Luminex"
),
tibble(
mae_assay_name = "flow",
se_object_name = "se_flow",
assay_label = "Flow cytometry"
)
) |>
mutate(
assay_label = assay_label |> fct_inorder()
)We will assemble the following tables (SE objects):
assay_list |> gt()| mae_assay_name | se_object_name | assay_label |
|---|---|---|
| mg | se_mg | Metagenomic |
| qPCR | se_qPRC | qPCR |
| amplicon | se_16S | 16S rRNA amplicon seq. |
| luminex | se_luminex | Luminex |
| flow | se_flow | Flow cytometry |
mae_coldata_assays tablesavailable_samples <-
map(
1:nrow(assay_list),
function(i, assay_list){
se <- get(assay_list$se_object_name[i])
tibble(
assay = assay_list$mae_assay_name[i],
se_colname = se |> colnames(),
uid = se$uid,
sample_type = se$sample_type,
control_type = se$control_type,
)
},
assay_list = assay_list
) |>
bind_rows()
if (!all(available_samples$se_colname == available_samples$uid, na.rm = TRUE)) {
stop("The uid column in the SE objects does not match their colnames")
}
mae_coldata_assays <-
available_samples |>
group_by(uid) |>
summarize(
n_sample_type = n_distinct(sample_type),
n_control_type = n_distinct(control_type),
sample_type = sample_type[1],
control_type = control_type[1],
assays = str_c(assay, collapse = ", ")
)
if (any(mae_coldata_assays$n_sample_type > 1)) {
warning("Some samples have multiple sample types")
}
if (any(mae_coldata_assays$n_control_type > 1)) {
warning("Some samples have multiple control types")
}
mae_coldata_assays <-
mae_coldata_assays |>
select(uid, sample_type, control_type, assays) The mae_coldata_assays table contains the following columns:
mae_coldata_assays |> colnames()[1] "uid" "sample_type" "control_type" "assays"
mae_coldata_assays |> str()tibble [4,184 × 4] (S3: tbl_df/tbl/data.frame)
$ uid : chr [1:4184] "068100004_0000" "068100004_1000" "068100004_1001" "068100004_1002" ...
$ sample_type : chr [1:4184] "Clinical sample" "Clinical sample" "Clinical sample" "Clinical sample" ...
$ control_type: chr [1:4184] "" "" "" "" ...
$ assays : chr [1:4184] "mg, qPCR, amplicon, luminex" "mg, qPCR, amplicon, luminex, flow" "amplicon" "amplicon" ...
mae_coldata_assays |>
dplyr::count(sample_type, control_type) |>
gt("Number of samples per sample type and control type") | sample_type | control_type | n |
|---|---|---|
| Clinical sample | 3962 | |
| Clinical sample (other study) | 7 | |
| Negative control | Nuclease-free water | 46 |
| Negative control | Unused swab + C2 | 47 |
| Positive control | Mock 1 | 57 |
| Positive control | Mock 2 | 57 |
| Test sample | 8 |
We also remove the columns sample_type and control_type from the colData of the individual assays to avoid conflicts when joining with the mae@colData table in downstream analyses.
for (i in 1:nrow(assay_list)) {
se <- get(assay_list$se_object_name[i])
if (("sample_type" %in% colnames(colData(se))) | ("control_type" %in% colnames(colData(se)))) {
colData(se) <- colData(se)[, !colnames(colData(se)) %in% c("sample_type", "control_type")]
}
assign(assay_list$se_object_name[i], se)
}MAE@colData: joining the visits and mae_coldata_assays tablesWe first do a full join of the visits and mae_coldata_assays tables by uid. This will allow us to keep all visits, even those that do not have assay data, and to identify which visits are in the CRF data and which are in the assay data.
mae_coldata <-
# we first do a full join of the CRF visits and mae_coldata_assays by uids;
dplyr::full_join(
visits |> select(-starts_with("specimen_collection")) |> mutate(in_CRF = TRUE),
mae_coldata_assays |> mutate(in_assays = TRUE) |> select(in_assays, everything()),
by = join_by(uid)
) |>
mutate(
in_CRF = in_CRF |> replace_na(FALSE),
in_assays = in_assays |> replace_na(FALSE)
)# We notice that there are 5 "Clinical sample" entries for which we do have assay data but that did not appear in the CRF data:
mae_coldata |>
filter(is.na(visit_code), sample_type == "Clinical sample")
# we fix these belowmae_coldata <-
mae_coldata |>
mutate(
needs_fixing = is.na(pid) & (sample_type == "Clinical sample") & str_detect(uid, "[0-9]{9}_[0-9]{4}"),
pid =
case_when(
needs_fixing ~ str_remove(uid, "_[0-9]{4}"),
TRUE ~ pid
),
visit_code =
case_when(
needs_fixing ~ str_remove(uid, "[0-9]{9}_"),
TRUE ~ visit_code
),
study_day =
case_when(
needs_fixing & (visit_code %in% c("0001", "0011")) ~ -7,
needs_fixing & in_assays & (visit_code == "1000") ~ 0,
TRUE ~ study_day
),
visit_attended =
case_when(
is.na(visit_attended) & needs_fixing ~ TRUE,
TRUE ~ visit_attended
),
visit_planned =
case_when(
is.na(visit_planned) & needs_fixing ~ "Unplanned visit",
TRUE ~ visit_planned
),
visit_type =
case_when(
is.na(visit_type) & needs_fixing ~ "Clinic",
TRUE ~ visit_type
)
) We then merge that table with the participants table to add the participant-invariant information (e.g., randomized, mITT, etc.) to the mae_coldata table.
# we add the participant-invariant information from the `participants` table
mae_coldata <-
mae_coldata |>
dplyr::left_join(
participants,
by = join_by(pid)
)mae_coldata <- mae_coldata |> select(-needs_fixing)mae_coldata |>
dplyr::count(in_CRF, randomized, in_assays, sample_type, assays) |>
gt()| in_CRF | randomized | in_assays | sample_type | assays | n |
|---|---|---|---|---|---|
| FALSE | FALSE | TRUE | Clinical sample | amplicon | 1 |
| FALSE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon, flow | 2 |
| FALSE | NA | TRUE | Clinical sample (other study) | mg, qPCR | 7 |
| FALSE | NA | TRUE | Negative control | amplicon | 70 |
| FALSE | NA | TRUE | Negative control | mg, qPCR | 12 |
| FALSE | NA | TRUE | Negative control | qPCR | 11 |
| FALSE | NA | TRUE | Positive control | amplicon | 91 |
| FALSE | NA | TRUE | Positive control | mg | 1 |
| FALSE | NA | TRUE | Positive control | mg, qPCR | 10 |
| FALSE | NA | TRUE | Positive control | qPCR | 12 |
| FALSE | NA | TRUE | Test sample | qPCR | 8 |
| TRUE | FALSE | FALSE | NA | NA | 414 |
| TRUE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon | 66 |
| TRUE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex | 5 |
| TRUE | FALSE | TRUE | Clinical sample | qPCR, amplicon | 1 |
| TRUE | TRUE | FALSE | NA | NA | 864 |
| TRUE | TRUE | TRUE | Clinical sample | amplicon | 2989 |
| TRUE | TRUE | TRUE | Clinical sample | amplicon, luminex | 1 |
| TRUE | TRUE | TRUE | Clinical sample | flow | 1 |
| TRUE | TRUE | TRUE | Clinical sample | luminex, flow | 1 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon | 49 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, flow | 3 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex | 491 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex, flow | 348 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, luminex | 2 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR, amplicon, luminex | 1 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR, amplicon, luminex, flow | 1 |
study_day for daily visit_code that did not have CRF dataThere were a few daily swabs with daily visit_code that did not have CRF data, and consequently did not have study days assigned.
We impute the missing values using the study_days from “surrounding” visits.
# study_days_for_daily_codes <-
# tibble(study_day = (1:(7*6)) -1) |>
# mutate(
# visit_code = 1000 + ((study_day %% 7) + 1) + (study_day %/% 7) * 100,
# visit_code = visit_code |> as.character()
# ) |>
# dplyr::rename(exp_study_day = study_day) |>
# select(visit_code, exp_study_day)
study_days_for_daily_codes <-
mae_coldata |>
dplyr::filter(visit_type == "Home") |>
dplyr::count(visit_code, study_day) |>
arrange(visit_code, -n) |>
group_by(visit_code) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::rename(exp_study_day = study_day) |>
select(-n)study_days_for_daily_codes |>
ggplot(aes(x = exp_study_day, y = visit_code)) +
geom_point()study_days_for_daily_codes <-
study_days_for_daily_codes |>
mutate(
exp_study_day =
case_when(
is.na(exp_study_day) & (visit_code %in% c(1501:1503)) ~ as.numeric(visit_code) - 1501 + 36,
TRUE ~ exp_study_day
)
)pid_with_missing_daily_study_days <-
mae_coldata |>
filter(is.na(study_day), in_assays) |>
select(pid) |> distinct() |>
left_join(mae_coldata, by = join_by(pid)) |>
arrange(pid, visit_code) |>
left_join(study_days_for_daily_codes) |>
mutate(diff = study_day - exp_study_day) |>
group_by(pid) |>
fill(diff, .direction = "downup") |>
ungroup() |>
mutate(
fixed_study_day =
case_when(
is.na(study_day) & !is.na(exp_study_day) ~ exp_study_day + diff,
TRUE ~ study_day
)
) |>
select(uid, pid, visit_code, study_day, fixed_study_day, exp_study_day, diff, in_CRF, in_assays, crf_plates, assays)
# pid_with_missing_daily_study_days |> View()mae_coldata <-
mae_coldata |>
left_join(pid_with_missing_daily_study_days |> select(uid, fixed_study_day), by = join_by(uid)) |>
mutate(
study_day =
case_when(
is.na(study_day) ~ fixed_study_day,
TRUE ~ study_day
)
) |>
select(-fixed_study_day)mae_coldata |>
filter(is.na(study_day), in_assays) |>
View()
# all is fixedvisit, visit_number, and study_week columnsvisit_dict <-
bind_rows(
tibble(
visit = "screening",
definition = "Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one.",
possible_visit_codes = "0000, 0001, 0010, 0011",
visit_number = "V0",
study_week = -1
),
tibble(
visit = "add. screening",
definition = 'If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit.',
possible_visit_codes = "0000, 0001, 0010, 0011",
visit_number = "V0b",
study_week = -2
),
tibble(
visit = "pre-MTZ",
definition = "Visit before the metronidazole treatment (MTZ) starts.",
possible_visit_codes = "1000, 1011",
visit_number = "V1",
study_week = 0
),
tibble(
visit = "post-MTZ",
definition = "Visit after the metronidazole treatment (MTZ) ends.",
possible_visit_codes = "1100",
visit_number = "V2",
study_week = 1
),
tibble(
visit = "post-INT",
definition = "Visit after the intervention (INT) ends.",
possible_visit_codes = "1200, 1211",
visit_number = "V3",
study_week = 2
),
tibble(
visit = "1w post-INT",
definition = "Visit one week after the intervention (INT) ends.",
possible_visit_codes = "1300",
visit_number = "V4",
study_week = 3
),
tibble(
visit = "2w post-INT",
definition = "Visit two weeks after the intervention (INT) ends.",
possible_visit_codes = "1400",
visit_number = "V5",
study_week = 4
),
tibble(
visit = "3w post-INT",
definition = "Visit three weeks after the intervention (INT) ends.",
possible_visit_codes = "1500, 1511, 1512",
visit_number = "V6",
study_week = 5
),
tibble(
visit = "5w post-INT",
definition = "Visit five weeks after the intervention (INT) ends.",
possible_visit_codes = "1700, 1711",
visit_number = "V7",
study_week = 7
),
tibble(
visit = "7w post-INT",
definition = "Visit seven weeks after the intervention (INT) ends.",
possible_visit_codes = "1900, 1911",
visit_number = "V8",
study_week = 9
),
tibble(
visit = "10w post-INT",
definition = "Visit ten weeks after the intervention (INT) ends.",
possible_visit_codes = "2120, 2121, 2122",
visit_number = "V9",
study_week = 12
)
) |>
mutate(
visit = visit |> fct_reorder(study_week)
)visit_dict |>
gt() |>
tab_header(
title = "Visit codes and definitions"
) |>
cols_label(
visit = "Visit",
definition = "Definition",
possible_visit_codes = "Possible visit codes",
visit_number = "Visit number",
study_week = "Study week"
) |>
cols_align(
align = "left", columns = c(visit, definition, possible_visit_codes)
)| Visit codes and definitions | ||||
|---|---|---|---|---|
| Visit | Definition | Possible visit codes | Visit number | Study week |
| screening | Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one. | 0000, 0001, 0010, 0011 | V0 | -1 |
| add. screening | If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit. | 0000, 0001, 0010, 0011 | V0b | -2 |
| pre-MTZ | Visit before the metronidazole treatment (MTZ) starts. | 1000, 1011 | V1 | 0 |
| post-MTZ | Visit after the metronidazole treatment (MTZ) ends. | 1100 | V2 | 1 |
| post-INT | Visit after the intervention (INT) ends. | 1200, 1211 | V3 | 2 |
| 1w post-INT | Visit one week after the intervention (INT) ends. | 1300 | V4 | 3 |
| 2w post-INT | Visit two weeks after the intervention (INT) ends. | 1400 | V5 | 4 |
| 3w post-INT | Visit three weeks after the intervention (INT) ends. | 1500, 1511, 1512 | V6 | 5 |
| 5w post-INT | Visit five weeks after the intervention (INT) ends. | 1700, 1711 | V7 | 7 |
| 7w post-INT | Visit seven weeks after the intervention (INT) ends. | 1900, 1911 | V8 | 9 |
| 10w post-INT | Visit ten weeks after the intervention (INT) ends. | 2120, 2121, 2122 | V9 | 12 |
We first determine, for each participant, which is their main vs. their additional screening visit.
mae_coldata <-
mae_coldata |>
group_by(pid) |>
mutate(
is_screening_visit =
case_when(
visit_code %in% c("0000", "0001", "0010", "0011") ~ TRUE,
TRUE ~ FALSE
),
n_assays = ifelse(in_assays, str_split(assays, ", ") |> lengths(), 0),
screening_visit_score = is_screening_visit * (n_assays + study_day/10)
) |>
arrange(-screening_visit_score) |>
mutate(
screening_visit =
case_when(
is_screening_visit & (row_number() == 1) ~ "main screening",
is_screening_visit & (row_number() > 1) ~ "additional screening",
TRUE ~ NA_character_
)
) |>
ungroup() |>
select(-is_screening_visit, -n_assays, -screening_visit_score)mae_coldata |>
dplyr::filter(screening_visit == "additional screening") |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
View()
mae_coldata |>
dplyr::filter(screening_visit == "additional screening") |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
dplyr::filter(randomized, (screening_visit == "main screening"), study_day == 0) |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, mITT, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
View()
# > check 068200020
# unclear - her main screening visit could be at day -3, but it might be at day 0, which would mean that her screening visit and pre-MTZ visit samples were collected on the same day
crf_raw$crf35 |> dplyr::filter(pid == "068200020") |> View()mae_coldata |>
filter(!is.na(screening_visit)) |>
dplyr::count(screening_visit) # A tibble: 2 × 2
screening_visit n
<chr> <int>
1 additional screening 66
2 main screening 532
We next join the mae_coldata table with the visit_dict table to add the visit, visit_number, and study_week columns.
mae_coldata <-
mae_coldata |>
select(-any_of(c("PIPV", "visit", "visit_number", "study_week"))) |>
left_join(
visit_dict |>
mutate(visit_code = possible_visit_codes |> str_split(", ")) |>
unnest(visit_code) |>
mutate(
screening_visit =
case_when(
visit_number == "V0b" ~ "additional screening",
visit_number == "V0" ~ "main screening",
TRUE ~ NA_character_
),
PIPV = visit_number != "V0b"
) |>
select(visit_code, screening_visit, PIPV, visit, visit_number, study_week)
) |>
mutate(PIPV = PIPV |> replace_na(FALSE)) |>
relocate(
PIPV, visit, visit_number, study_week, .after = "visit_code"
) We check that study_days are unique for each visit_number at which assay samples were collected for each participant.
mae_coldata |>
group_by(pid, visit_number) |>
mutate(n_study_day = n_distinct(study_day[!is.na(assays)])) |>
ungroup() |>
filter(PIPV, n_study_day > 1) |>
select(uid, pid, visit_code, study_day, assays) |>
group_by(pid) |>
gt(row_group_as_column = TRUE)| uid | visit_code | study_day | assays | |
|---|---|---|---|---|
| 068200062 | 068200062_1500 | 1500 | 37 | mg, qPCR, amplicon |
| 068200062_1511 | 1511 | 44 | luminex, flow | |
| 068200087 | 068200087_1500 | 1500 | 40 | mg, qPCR, amplicon, luminex |
| 068200087_1511 | 1511 | 43 | flow |
There are just two participants for which the dates don’t match and the different assays were collected at different dates. This may be important for integrative analyses later, so we leave as is for now.
specimen_collection_swab (collected, not collected, unclear)specimen_collection_softcup (collected, not collected, unclear)specimen_collection_cytobrush (collected, not collected, unclear)mgqPCRampliconluminexflowsample_and_visit_info_assay <-
mae_coldata |>
select(uid, sample_type, in_CRF, in_assays, randomized)
sample_and_visit_info_assay <-
sample_and_visit_info_assay |>
dplyr::left_join(
visits |> select(uid, starts_with("specimen_collection")),
by = join_by(uid)
)
sample_and_visit_info_assay <-
sample_and_visit_info_assay |>
expand_grid(assay = assay_list$mae_assay_name) |>
dplyr::left_join(
available_samples |>
select(uid, assay) |>
mutate(value = TRUE),
by = join_by(uid, assay)
) |>
mutate(value = value |> replace_na(FALSE)) |>
pivot_wider(names_from = assay, values_from = value)
# sample_and_visit_info_assay <-
# sample_and_visit_info_assay |>
# mutate(
# mg_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & mg ~ "Not a clinical sample",
# is.na(randomized) & mg ~ "???",
# !is.na(randomized) & !randomized & mg ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & mg ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & mg ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !mg ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !mg ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# qPCR_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & qPCR ~ "Not a clinical sample",
# is.na(randomized) & qPCR ~ "???",
# !is.na(randomized) & !randomized & qPCR ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & qPCR ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & qPCR ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !qPCR ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !qPCR ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# amplicon_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & amplicon ~ "Not a clinical sample",
# is.na(randomized) & amplicon ~ "???",
# !is.na(randomized) & !randomized & amplicon ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & amplicon ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & amplicon ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !amplicon ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !amplicon ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# luminex_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & luminex ~ "Not a clinical sample",
# is.na(randomized) & luminex ~ "???",
# !is.na(randomized) & !randomized & luminex ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_softcup) & (specimen_collection_softcup != "not collected") & luminex ~ "Expected data",
# (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & luminex ~ "Unexpected data",
# !is.na(specimen_collection_softcup) & (specimen_collection_softcup == "collected") & !luminex ~ "Missing data",
# (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & !luminex ~ "Sample not collected",
# TRUE ~ "ERROR"
# )
# )
#
# The last 5 columns can take the following values:
#
# - `expected` (swab collected and available data),
# - `unexpected` (swab not collected but available data),
# - `missing` (swab collected but no available data),
# - `maybe missing` (unclear if swab was collected and no available data),
# - `not collected` (swab not collected),
# - `additional` (swab collected from a non-randomized participant),
# - `NA` (not collected, no available data)
# se_sample_and_visit_info_assay <-
SummarizedExperiment(
assays = list(
sample_and_visit_info =
sample_and_visit_info_assay |>
select(-c(sample_type, in_CRF, in_assays, randomized)) |>
as.data.frame() |>
column_to_rownames("uid") |>
t()
),
colData =
sample_and_visit_info_assay |> select(uid) |>
mutate(rownames = uid) |> as.data.frame() |>
column_to_rownames("rownames")
)MultiAssayExperiment objectSE_list <-
list(
visit_and_sample_info = se_sample_and_visit_info_assay,
mg = se_mg,
qPCR = se_qPRC,
amplicon = se_16S,
luminex = se_luminex,
flow = se_flow
)
mae <-
MultiAssayExperiment::MultiAssayExperiment(
experiments = MultiAssayExperiment::ExperimentList(SE_list),
colData =
mae_coldata |> select(-in_CRF, -in_assays) |>
as.data.frame() |> mutate(rownames = uid) |>
column_to_rownames("rownames"),
metadata = list(
study = "VIBRANT",
data_source = "actual study data (not simulated)",
data_status = "partially QCed",
date = today(),
colData_dictionary = participants_variable_dictionary,
crf_plates_description = crf_plates_dictionary,
crf_data_raw = crf_raw,
crf_data_clean = crf_clean,
participant_crfs_merged = participant_crfs_merged,
visits_crfs_merged = visits_crfs_merged,
exposures = exposures
)
)maeA MultiAssayExperiment object of 6 listed
experiments with user-defined names and respective classes.
Containing an ExperimentList class object of length 6:
[1] visit_and_sample_info: SummarizedExperiment with 8 rows and 5462 columns
[2] mg: SummarizedExperiment with 779 rows and 996 columns
[3] qPCR: SummarizedExperiment with 16 rows and 1029 columns
[4] amplicon: SummarizedExperiment with 817 rows and 4119 columns
[5] luminex: SummarizedExperiment with 48 rows and 850 columns
[6] flow: SummarizedExperiment with 11 rows and 356 columns
Functionality:
experiments() - obtain the ExperimentList instance
colData() - the primary/phenotype DataFrame
sampleMap() - the sample coordination DataFrame
`$`, `[`, `[[` - extract colData columns, subset, or experiment
*Format() - convert into a long or wide DataFrame
assays() - convert ExperimentList to a SimpleList of matrices
exportClass() - save data to flat files
mae_sample_map <-
mae@sampleMap |> as_tibble() |> dplyr::filter(assay %in% assay_list$mae_assay_name) |>
select(uid = primary, assay) |>
arrange(uid) |>
dplyr::left_join(assay_list |> dplyr::rename(assay = mae_assay_name), by = join_by(assay)) |>
dplyr::right_join(
mae@colData |> as.data.frame() |> as_tibble() |>
select(uid, pid, visit_code, PIPV, visit, visit_type, sample_type, location, randomized, arm),
by = join_by(uid)
)All samples
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
group_by(uid) |> mutate(n = n()) |> ungroup() |>
mutate(uid = uid |> fct_reorder(-n)) |>
ggplot() +
aes(x = uid, y = assay_label |> fct_rev(), fill = assay_label) +
geom_tile() +
facet_grid(. ~ sample_type, scales = "free_x", space = "free_x") +
scale_x_discrete("Sample IDs", breaks = NULL) +
ylab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)Longitudinal availability for all visits
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
# filter(visit_code %in% c("0000", seq(1000, 1900, by = 100), "2120")) |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
ifelse(randomized, "Randomized", "Not randomized") + location ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)For the weekly visits of all participants
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(visit_type == "Clinic") |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + ifelse(randomized, "Randomized", "Not randomized") + arm ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)For the weekly visits of randomized participants
mae_sample_map |>
dplyr::filter(randomized) |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(visit_type == "Clinic") |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + ifelse(randomized, "Randomized", "Not randomized") + arm ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)mae_sample_map |>
dplyr::filter(randomized) |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(PIPV) |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + arm ~ visit,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)MultiAssayExperiment objectsWe save the “master” MAE (with all assays); but if needed for publication, we can subset the MAE to only include the assays/data of interest.
saveRDS(
mae,
str_c(
get_02_output_dir(),
"mae_full_", today() |> str_remove_all("-"), ".rds"
)
)